# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Path to the NeMo repository checkout.
# Defaults to ./NeMo; set NEMO_PATH in the environment to use another checkout.
NEMO_PATH="${NEMO_PATH:-NeMo}"

# Folder holding the input .tsv files and model config files; the tar files
# are written beneath it. Defaults to ./data_folder; set DATA_PATH in the
# environment to use another location.
DATA_PATH="${DATA_PATH:-data_folder}"

## data_folder_example
##   ├── tarred_data
##   |    └── (output)
##   ├── config.json
##   ├── label_map.txt
##   ├── semiotic_classes.txt
##   ├── test.tsv
##   ├── 1.tsv
##   ├── ...
##   └── 200.tsv

## Each of the {1..200}.tsv input files is a 110'000-example subset of all.tsv (excluding the validation part),
## generated by https://github.com/bene-ges/nemo_compatible/blob/main/scripts/nlp/en_spellmapper/dataset_preparation/build_training_data.sh
## Note that in this example each input file holds 110'000 examples, but only 100'000 of them are packed into each tar file.
## This is because some input examples (e.g. ones that are too long) can be skipped during preprocessing, and we want all tar files to contain the same fixed number of examples.

# Build one tar file per input shard:
#   ${DATA_PATH}/<part>.tsv  ->  ${DATA_PATH}/tarred_data/part<part>.tar
# Requires bash: {1..200} is a brace expansion, not POSIX sh.

# Make sure the output directory exists before the first shard is written.
mkdir -p -- "${DATA_PATH}/tarred_data" || exit 1

# Shards whose conversion failed; reported once the loop is finished so that
# one bad shard does not prevent the remaining ones from being processed.
failed_parts=()

for part in {1..200}
do
    # Each override is quoted so that a path containing spaces or glob
    # characters is passed to the Python script as a single argument.
    # take_first_n_lines caps every tar file at the same number of examples.
    if ! python "${NEMO_PATH}/examples/nlp/spellchecking_asr_customization/create_tarred_dataset.py" \
        lang="en" \
        "data.train_ds.data_path=${DATA_PATH}/${part}.tsv" \
        "data.validation_ds.data_path=${DATA_PATH}/test.tsv" \
        model.max_sequence_len=256 \
        model.language_model.pretrained_model_name=huawei-noah/TinyBERT_General_6L_768D \
        "model.language_model.config_file=${DATA_PATH}/config.json" \
        "model.label_map=${DATA_PATH}/label_map.txt" \
        "model.semiotic_classes=${DATA_PATH}/semiotic_classes.txt" \
        "+output_tar_file=${DATA_PATH}/tarred_data/part${part}.tar" \
        +take_first_n_lines=100000
    then
        printf 'create_tarred_dataset.py failed for part %s\n' "${part}" >&2
        failed_parts+=("${part}")
    fi
done

# Surface failures through the exit status instead of finishing silently.
if (( ${#failed_parts[@]} > 0 )); then
    printf 'Failed to create tar files for %d part(s): %s\n' \
        "${#failed_parts[@]}" "${failed_parts[*]}" >&2
    exit 1
fi
